// evmone: Fast Ethereum Virtual Machine implementation
// Copyright 2022 The Silkworm & evmone Authors.
// SPDX-License-Identifier: Apache-2.0

/// @file
/// SHA256 implementation.
/// Initial version copied from the Silkworm project (https://github.com/erigontech/silkworm).
/// Based on several bits of code released to public domain:
/// https://github.com/amosnier/sha-2 (Author: Alain Mosnier)
/// https://github.com/noloader/SHA-Intrinsics (Author: Jeffrey Walton)
/// https://github.com/Mysticial/FeatureDetector (Author: Alexander Yee)

#include "sha256.hpp"
#include <bit>
#include <cstdint>
#include <cstring>

#if defined(__x86_64__)  // NOLINT(readability-use-concise-preprocessor-directives)

#include <cpuid.h>
#include <x86intrin.h>

#elif defined(__aarch64__) && defined(__APPLE__)

#include <arm_neon.h>

#if defined(__linux__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#endif  // defined(__linux__), defined(__APPLE__)

#endif  // defined(__x86_64__), defined(__aarch64__)

namespace evmone::crypto
{

#define CHUNK_SIZE 64
#define TOTAL_LEN_LEN 8

/*
 * Comments from pseudo-code at https://en.wikipedia.org/wiki/SHA-2 are reproduced here.
 * When useful for clarification, portions of the pseudo-code are reproduced here too.
 */

/*
 * Initialize array of round constants:
 * (first 32 bits of the fractional parts of the cube roots of the first 64 primes 2..311):
 */
static const uint32_t k[] = {0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe,
    0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa,
    0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb,
    0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624,
    0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb,
    0xbef9a3f7, 0xc67178f2};

struct BufferState
{
    const std::byte* p;
    size_t len;
    size_t total_len;
    bool single_one_delivered = false;
    bool total_len_delivered = false;

    constexpr BufferState(const std::byte* input, size_t size)
      : p{input}, len{size}, total_len{size}
    {}
};

static bool calc_chunk(uint8_t chunk[CHUNK_SIZE], struct BufferState* state)
{
    if (state->total_len_delivered)
    {
        return false;
    }

    if (state->len >= CHUNK_SIZE)
    {
        memcpy(chunk, state->p, CHUNK_SIZE);
        state->p += CHUNK_SIZE;
        state->len -= CHUNK_SIZE;
        return true;
    }

    size_t space_in_chunk = CHUNK_SIZE - state->len;
    if (state->len != 0)
    {  // avoid adding 0 to nullptr
        memcpy(chunk, state->p, state->len);
        chunk += state->len;
        state->p += state->len;
    }
    state->len = 0;

    /* If we are here, space_in_chunk is one at minimum. */
    if (!state->single_one_delivered)
    {
        *chunk++ = 0x80;
        space_in_chunk -= 1;
        state->single_one_delivered = true;
    }

    /*
     * Now:
     * - either there is enough space left for the total length, and we can conclude,
     * - or there is too little space left, and we have to pad the rest of this chunk with zeroes.
     * In the latter case, we will conclude at the next invocation of this function.
     */
    if (space_in_chunk >= TOTAL_LEN_LEN)
    {
        const size_t left = space_in_chunk - TOTAL_LEN_LEN;
        size_t len = state->total_len;
        int i = 0;
        memset(chunk, 0x00, left);
        chunk += left;

        /* Storing of len * 8 as a big endian 64-bit without overflow. */
        chunk[7] = (uint8_t)(len << 3);
        len >>= 5;
        for (i = 6; i >= 0; i--)
        {
            chunk[i] = (uint8_t)len;
            len >>= 8;
        }
        state->total_len_delivered = true;
    }
    else
    {
        memset(chunk, 0x00, space_in_chunk);
    }

    return true;
}

[[gnu::always_inline, msvc::forceinline]] static void sha_256_implementation(
    uint32_t h[8], const std::byte* input, size_t len)
{
    /*
     * Note 1: All integers (expect indexes) are 32-bit unsigned integers and addition is calculated
     * modulo 2^32.
     *
     * Note 2: For each round, there is one round constant k[i] and one entry in the message
     * schedule array w[i], 0 = i = 63
     *
     * Note 3: The compression function uses 8 working variables, a through h
     *
     * Note 4: Big-endian convention is used when expressing the constants in this pseudocode,
     *     and when parsing message block data from bytes to words, for example,
     *     the first word of the input message "abc" after padding is 0x61626380
     */

    BufferState state{input, len};

    /* 512-bit chunks is what we will operate on. */
    uint8_t chunk[CHUNK_SIZE];

    while (calc_chunk(chunk, &state))
    {
        unsigned i = 0;
        unsigned j = 0;

        uint32_t ah[8];
        /* Initialize working variables to current hash value: */
        for (i = 0; i < 8; i++)
        {
            ah[i] = h[i];
        }

        const uint8_t* p = chunk;

        /*
         * The w-array is really w[64], but since we only need 16 of them at a time, we save stack
         * by calculating 16 at a time.
         *
         * This optimization was not there initially and the rest of the comments about w[64] are
         * kept in their initial state.
         */

        /*
         * create a 64-entry message schedule array w[0..63] of 32-bit words (The initial values in
         * w[0..63] don't matter, so many implementations zero them here) copy chunk into first 16
         * words w[0..15] of the message schedule array
         */
        uint32_t w[16];

        /* Compression function main loop: */
        for (i = 0; i < 4; i++)
        {
            for (j = 0; j < 16; j++)
            {
                if (i == 0)
                {
                    w[j] = (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 | (uint32_t)p[2] << 8 |
                           (uint32_t)p[3];
                    p += 4;
                }
                else
                {
                    /* Extend the first 16 words into the remaining 48 words w[16..63] of the
                     * message schedule array: */
                    const uint32_t s0 = std::rotr(w[(j + 1) & 0xf], 7) ^
                                        std::rotr(w[(j + 1) & 0xf], 18) ^ (w[(j + 1) & 0xf] >> 3);
                    const uint32_t s1 = std::rotr(w[(j + 14) & 0xf], 17) ^
                                        std::rotr(w[(j + 14) & 0xf], 19) ^
                                        (w[(j + 14) & 0xf] >> 10);
                    w[j] = w[j] + s0 + w[(j + 9) & 0xf] + s1;
                }
                const uint32_t s1 =
                    std::rotr(ah[4], 6) ^ std::rotr(ah[4], 11) ^ std::rotr(ah[4], 25);
                const uint32_t ch = (ah[4] & ah[5]) ^ (~ah[4] & ah[6]);
                const uint32_t temp1 = ah[7] + s1 + ch + k[i << 4 | j] + w[j];
                const uint32_t s0 =
                    std::rotr(ah[0], 2) ^ std::rotr(ah[0], 13) ^ std::rotr(ah[0], 22);
                const uint32_t maj = (ah[0] & ah[1]) ^ (ah[0] & ah[2]) ^ (ah[1] & ah[2]);
                const uint32_t temp2 = s0 + maj;

                ah[7] = ah[6];
                ah[6] = ah[5];
                ah[5] = ah[4];
                ah[4] = ah[3] + temp1;
                ah[3] = ah[2];
                ah[2] = ah[1];
                ah[1] = ah[0];
                ah[0] = temp1 + temp2;
            }
        }

        /* Add the compressed chunk to the current hash value: */
        for (i = 0; i < 8; i++)
        {
            h[i] += ah[i];
        }
    }
}

static void sha_256_generic(uint32_t h[8], const std::byte* input, size_t len)
{
    sha_256_implementation(h, input, len);
}

static void (*sha_256_best)(uint32_t h[8], const std::byte* input, size_t len) = sha_256_generic;

#if defined(__x86_64__)  // NOLINT(readability-use-concise-preprocessor-directives)

__attribute__((target("bmi,bmi2"))) static void sha_256_x86_bmi(
    uint32_t h[8], const std::byte* input, size_t len)
{
    sha_256_implementation(h, input, len);
}

[[gnu::always_inline]] static __m128i set(uint64_t a, uint64_t b) noexcept
{
    // NOLINTNEXTLINE(*-runtime-int)
    return _mm_set_epi64x(static_cast<long long>(a), static_cast<long long>(b));
}

// The following function was adapted from
// https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c
/*   Intel SHA extensions using C intrinsics               */
/*   Written and place in public domain by Jeffrey Walton  */
/*   Based on code from Intel, and by Sean Gulley for      */
/*   the miTLS project.                                    */
__attribute__((target("sha,sse4.1"))) static void sha_256_x86_sha(
    uint32_t h[8], const std::byte* input, size_t len)
{
    // NOLINTBEGIN(readability-isolate-declaration)
    __m128i STATE0, STATE1;
    __m128i MSG, TMP;
    __m128i MSG0, MSG1, MSG2, MSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;
    // NOLINTEND(readability-isolate-declaration)

    const __m128i MASK = set(0x0c0d0e0f08090a0b, 0x0405060700010203);

    // NOLINTBEGIN(cppcoreguidelines-pro-type-cstyle-cast)
    // NOLINTBEGIN(portability-simd-intrinsics)
    /* Load initial values */
    TMP = _mm_loadu_si128((const __m128i*)&h[0]);
    STATE1 = _mm_loadu_si128((const __m128i*)&h[4]);

    TMP = _mm_shuffle_epi32(TMP, 0xB1);          /* CDAB */
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    /* EFGH */
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    /* ABEF */
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */

    BufferState state{input, len};

    /* 512-bit chunks is what we will operate on. */
    uint8_t chunk[CHUNK_SIZE];

    while (calc_chunk(chunk, &state))
    {
        /* Save current state */
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        /* Rounds 0-3 */
        MSG = _mm_loadu_si128((const __m128i*)(chunk + 0));
        MSG0 = _mm_shuffle_epi8(MSG, MASK);
        MSG = _mm_add_epi32(MSG0, set(0xE9B5DBA5B5C0FBCF, 0x71374491428A2F98));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        /* Rounds 4-7 */
        MSG1 = _mm_loadu_si128((const __m128i*)(chunk + 16));
        MSG1 = _mm_shuffle_epi8(MSG1, MASK);
        MSG = _mm_add_epi32(MSG1, set(0xAB1C5ED5923F82A4, 0x59F111F13956C25B));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

        /* Rounds 8-11 */
        MSG2 = _mm_loadu_si128((const __m128i*)(chunk + 32));
        MSG2 = _mm_shuffle_epi8(MSG2, MASK);
        MSG = _mm_add_epi32(MSG2, set(0x550C7DC3243185BE, 0x12835B01D807AA98));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

        /* Rounds 12-15 */
        MSG3 = _mm_loadu_si128((const __m128i*)(chunk + 48));
        MSG3 = _mm_shuffle_epi8(MSG3, MASK);
        MSG = _mm_add_epi32(MSG3, set(0xC19BF1749BDC06A7, 0x80DEB1FE72BE5D74));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
        MSG0 = _mm_add_epi32(MSG0, TMP);
        MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

        /* Rounds 16-19 */
        MSG = _mm_add_epi32(MSG0, set(0x240CA1CC0FC19DC6, 0xEFBE4786E49B69C1));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
        MSG1 = _mm_add_epi32(MSG1, TMP);
        MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

        /* Rounds 20-23 */
        MSG = _mm_add_epi32(MSG1, set(0x76F988DA5CB0A9DC, 0x4A7484AA2DE92C6F));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
        MSG2 = _mm_add_epi32(MSG2, TMP);
        MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

        /* Rounds 24-27 */
        MSG = _mm_add_epi32(MSG2, set(0xBF597FC7B00327C8, 0xA831C66D983E5152));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
        MSG3 = _mm_add_epi32(MSG3, TMP);
        MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

        /* Rounds 28-31 */
        MSG = _mm_add_epi32(MSG3, set(0x1429296706CA6351, 0xD5A79147C6E00BF3));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
        MSG0 = _mm_add_epi32(MSG0, TMP);
        MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

        /* Rounds 32-35 */
        MSG = _mm_add_epi32(MSG0, set(0x53380D134D2C6DFC, 0x2E1B213827B70A85));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
        MSG1 = _mm_add_epi32(MSG1, TMP);
        MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

        /* Rounds 36-39 */
        MSG = _mm_add_epi32(MSG1, set(0x92722C8581C2C92E, 0x766A0ABB650A7354));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
        MSG2 = _mm_add_epi32(MSG2, TMP);
        MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

        /* Rounds 40-43 */
        MSG = _mm_add_epi32(MSG2, set(0xC76C51A3C24B8B70, 0xA81A664BA2BFE8A1));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
        MSG3 = _mm_add_epi32(MSG3, TMP);
        MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

        /* Rounds 44-47 */
        MSG = _mm_add_epi32(MSG3, set(0x106AA070F40E3585, 0xD6990624D192E819));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
        MSG0 = _mm_add_epi32(MSG0, TMP);
        MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

        /* Rounds 48-51 */
        MSG = _mm_add_epi32(MSG0, set(0x34B0BCB52748774C, 0x1E376C0819A4C116));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
        MSG1 = _mm_add_epi32(MSG1, TMP);
        MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

        /* Rounds 52-55 */
        MSG = _mm_add_epi32(MSG1, set(0x682E6FF35B9CCA4F, 0x4ED8AA4A391C0CB3));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
        MSG2 = _mm_add_epi32(MSG2, TMP);
        MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        /* Rounds 56-59 */
        MSG = _mm_add_epi32(MSG2, set(0x8CC7020884C87814, 0x78A5636F748F82EE));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
        MSG3 = _mm_add_epi32(MSG3, TMP);
        MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        /* Rounds 60-63 */
        MSG = _mm_add_epi32(MSG3, set(0xC67178F2BEF9A3F7, 0xA4506CEB90BEFFFA));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        /* Combine state  */
        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
    }

    TMP = _mm_shuffle_epi32(STATE0, 0x1B);       /* FEBA */
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    /* DCHG */
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    /* ABEF */

    /* Save state */
    _mm_storeu_si128((__m128i*)&h[0], STATE0);
    _mm_storeu_si128((__m128i*)&h[4], STATE1);
    // NOLINTEND(portability-simd-intrinsics)
    // NOLINTEND(cppcoreguidelines-pro-type-cstyle-cast)
}

// https://stackoverflow.com/questions/6121792/how-to-check-if-a-cpu-supports-the-sse3-instruction-set
static void cpuid(int info[4], int InfoType)  // NOLINT(readability-non-const-parameter)
{
    __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
}

__attribute__((constructor)) static void select_sha256_implementation()
{
    int info[4];
    cpuid(info, 0);
    const int nIds = info[0];

    bool hw_sse41 = false;
    bool hw_bmi1 = false;
    bool hw_bmi2 = false;
    bool hw_sha = false;

    if (nIds >= 0x00000001)
    {
        cpuid(info, 0x00000001);
        hw_sse41 = (info[2] & (1 << 19)) != 0;
    }
    if (nIds >= 0x00000007)
    {
        cpuid(info, 0x00000007);
        hw_bmi1 = (info[1] & (1 << 3)) != 0;
        hw_bmi2 = (info[1] & (1 << 8)) != 0;
        hw_sha = (info[1] & (1 << 29)) != 0;
    }

    if (hw_sse41 && hw_sha)
    {
        sha_256_best = sha_256_x86_sha;
    }
    else if (hw_bmi1 && hw_bmi2)
    {
        sha_256_best = sha_256_x86_bmi;
    }
}

#elif defined(__aarch64__) && defined(__APPLE__)

// The following function was adapted from
// https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c
/* sha256-arm.c - ARMv8 SHA extensions using C intrinsics     */
/*   Written and placed in public domain by Jeffrey Walton    */
/*   Based on code from ARM, and by Johannes Schneiders, Skip */
/*   Hovsmith and Barry O'Rourke for the mbedTLS project.     */
static void sha_256_arm_v8(uint32_t h[8], const std::byte* input, size_t len)
{
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1, TMP2;

    /* Load state */
    STATE0 = vld1q_u32(&h[0]);
    STATE1 = vld1q_u32(&h[4]);

    BufferState state{input, len};

    /* 512-bit chunks is what we will operate on. */
    uint8_t chunk[CHUNK_SIZE];

    while (calc_chunk(chunk, &state))
    {
        /* Save state */
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        /* Load message */
        MSG0 = vld1q_u32((const uint32_t*)(chunk + 0));
        MSG1 = vld1q_u32((const uint32_t*)(chunk + 16));
        MSG2 = vld1q_u32((const uint32_t*)(chunk + 32));
        MSG3 = vld1q_u32((const uint32_t*)(chunk + 48));

        /* Reverse for little endian */
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));

        TMP0 = vaddq_u32(MSG0, vld1q_u32(&k[0x00]));

        /* Rounds 0-3 */
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&k[0x04]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        /* Rounds 4-7 */
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&k[0x08]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        /* Rounds 8-11 */
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&k[0x0c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        /* Rounds 12-15 */
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&k[0x10]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        /* Rounds 16-19 */
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&k[0x14]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        /* Rounds 20-23 */
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&k[0x18]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        /* Rounds 24-27 */
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&k[0x1c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        /* Rounds 28-31 */
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&k[0x20]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        /* Rounds 32-35 */
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&k[0x24]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        /* Rounds 36-39 */
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&k[0x28]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        /* Rounds 40-43 */
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&k[0x2c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        /* Rounds 44-47 */
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&k[0x30]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        /* Rounds 48-51 */
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&k[0x34]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        /* Rounds 52-55 */
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&k[0x38]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

        /* Rounds 56-59 */
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&k[0x3c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        /* Rounds 60-63 */
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

        /* Combine state */
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
    }

    /* Save state */
    vst1q_u32(&h[0], STATE0);
    vst1q_u32(&h[4], STATE1);
}

__attribute__((constructor)) static void select_sha256_implementation(void)
{
#if defined(__linux__)
    if ((getauxval(AT_HWCAP) & HWCAP_SHA2) != 0)
    {
        sha_256_best = sha_256_arm_v8;
    }
#elif defined(__APPLE__)
    int64_t hw_cap = 0;
    size_t size = sizeof(hw_cap);

    if (sysctlbyname("hw.optional.armv8_2_sha3", &hw_cap, &size, NULL, 0) == 0)
    {
        // Use SHA3 as proxy for SHA2 (sysctl hw doesn't list SHA2 for Apple M1)
        if (hw_cap == 1)
        {
            sha_256_best = sha_256_arm_v8;
        }
    }
#endif  // defined(__linux__), defined(__APPLE__)
}

#endif  // defined(__x86_64__), defined(__aarch64__)

/*
 * Limitations:
 * - Since input is a pointer in RAM, the data to hash should be in RAM, which could be a problem
 *   for large data sizes.
 * - SHA algorithms theoretically operate on bit strings. However, this implementation has no
 * support for bit string lengths that are not multiples of eight, and it really operates on arrays
 * of bytes. In particular, the len parameter is a number of bytes.
 */
void sha256(std::byte hash[SHA256_HASH_SIZE], const std::byte* data, size_t size)
{
    /*
     * Initialize hash values:
     * (first 32 bits of the fractional parts of the square roots of the first 8 primes 2..19):
     */
    uint32_t h[] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c,
        0x1f83d9ab, 0x5be0cd19};

    sha_256_best(h, data, size);

    /* Produce the final hash value (big-endian): */
    for (unsigned i = 0, j = 0; i < 8; i++)
    {
        hash[j++] = static_cast<std::byte>(h[i] >> 24);
        hash[j++] = static_cast<std::byte>(h[i] >> 16);
        hash[j++] = static_cast<std::byte>(h[i] >> 8);
        hash[j++] = static_cast<std::byte>(h[i]);
    }
}

}  // namespace evmone::crypto
